import os
import logging as log
import warnings
import matplotlib.pyplot as plt
import sys, os
from util.commons import *
from util.ui import *
from util.model import *
from util.split import *
from util.dataset import *
from IPython.display import display, HTML
In this notebook a dataset named 'Risk Factors for Cervical Cancer' is used. The dataset was collected at the 'Hospital Universitario de Caracas' in Caracas, Venezuela. The dataset comprises demographic information, habits, and historic medical records of 858 patients. Several patients decided not to answer some of the questions because of privacy concerns (missing values).
# Load the 'Risk Factors for Cervical Cancer' dataset through the project
# helper; it returns a dataset wrapper (with a .df DataFrame) and a status message.
dataset, msg = get_dataset('cervical_cancer')
display(msg)
display(dataset.df)
"Dataset 'cervical_cancer (Risk Factors for Cervical Cancer)' loaded successfully. For further information about this dataset please visit: https://archive.ics.uci.edu/ml/datasets/Cervical+cancer+%28Risk+Factors%29#"
| Age | Number of sexual partners | First sexual intercourse | Num of pregnancies | Smokes | Smokes (years) | Smokes (packs/year) | Hormonal Contraceptives | Hormonal Contraceptives (years) | IUD | IUD (years) | STDs | STDs (number) | STDs:condylomatosis | STDs:cervical condylomatosis | STDs:vaginal condylomatosis | STDs:vulvo-perineal condylomatosis | STDs:syphilis | STDs:pelvic inflammatory disease | STDs:genital herpes | STDs:molluscum contagiosum | STDs:AIDS | STDs:HIV | STDs:Hepatitis B | STDs:HPV | STDs: Number of diagnosis | STDs: Time since first diagnosis | STDs: Time since last diagnosis | Dx:Cancer | Dx:CIN | Dx:HPV | Dx | Hinselmann | Schiller | Citology | Biopsy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18 | 4.0 | 15.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 15 | 1.0 | 14.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 34 | 1.0 | ? | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 52 | 5.0 | 16.0 | 4.0 | 1.0 | 37.0 | 37.0 | 1.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 46 | 3.0 | 21.0 | 4.0 | 0.0 | 0.0 | 0.0 | 1.0 | 15.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 853 | 34 | 3.0 | 18.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 854 | 32 | 2.0 | 19.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 855 | 25 | 2.0 | 17.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.08 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 856 | 33 | 2.0 | 24.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.08 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 857 | 29 | 2.0 | 20.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | ? | ? | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
858 rows × 36 columns
The dataset will be used in the same way as described here: https://christophm.github.io/interpretable-ml-book/cervical.html. All unknown values ('?') are going to be set to 0.0.
# Reduce the raw dataset to the feature subset used in the Interpretable ML
# book's cervical-cancer example. NOTE: `columns=` already selects the column
# axis, so the redundant `axis=1` of the original call was dropped.
df = dataset.df.drop(columns=['Smokes (packs/year)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis', 'STDs:genital herpes',
                              'STDs:Hepatitis B', 'STDs:vulvo-perineal condylomatosis', 'Dx:HPV',
                              'STDs:molluscum contagiosum', 'STDs:syphilis', 'STDs:AIDS', 'Hinselmann',
                              'STDs:pelvic inflammatory disease', 'STDs:HPV', 'Dx:CIN', 'Dx', 'STDs:HIV',
                              'Schiller', 'STDs:vaginal condylomatosis', 'Dx:Cancer', 'Citology'])

# Columns holding numeric measurements; the remaining columns keep their dtype.
num_cols = ['Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes',
            'Smokes (years)', 'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
            'IUD (years)', 'STDs', 'STDs (number)', 'STDs: Time since first diagnosis',
            'STDs: Time since last diagnosis']

# Replace the '?' placeholders (questions patients declined to answer) with 0.0.
df = normalize_undefined_values('?', df)

# Heuristic typing: numeric columns with more than `str_limit` distinct values
# are treated as continuous floats, low-cardinality ones as categorical strings.
# nunique(dropna=False) matches the original len(df[col].unique()) exactly.
str_limit = 5
for col in num_cols:
    if df[col].nunique(dropna=False) > str_limit:
        df[col] = df[col].astype('float')
    else:
        df[col] = df[col].astype(str)
df
| Age | Number of sexual partners | First sexual intercourse | Num of pregnancies | Smokes | Smokes (years) | Hormonal Contraceptives | Hormonal Contraceptives (years) | IUD | IUD (years) | STDs | STDs (number) | STDs: Number of diagnosis | STDs: Time since first diagnosis | STDs: Time since last diagnosis | Biopsy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18 | 4.0 | 15.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| 1 | 15 | 1.0 | 14.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| 2 | 34 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| 3 | 52 | 5.0 | 16.0 | 4.0 | 1.0 | 37.0 | 1.0 | 3.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| 4 | 46 | 3.0 | 21.0 | 4.0 | 0.0 | 0.0 | 1.0 | 15.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 853 | 34 | 3.0 | 18.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| 854 | 32 | 2.0 | 19.0 | 1.0 | 0.0 | 0.0 | 1.0 | 8.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| 855 | 25 | 2.0 | 17.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.08 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| 856 | 33 | 2.0 | 24.0 | 2.0 | 0.0 | 0.0 | 1.0 | 0.08 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
| 857 | 29 | 2.0 | 20.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.50 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0 |
858 rows × 16 columns
Three visualization functions offered by the XAI module will be used for analyzing the dataset.
# Dataset analysis with the XAI library: an imbalance plot for the target
# 'Biopsy' and two correlation plots (explicit matrix view, then the
# library-default view).
import xai
%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')
imbalanced_cols = ['Biopsy']
xai.imbalance_plot(df, *imbalanced_cols, categorical_cols=['Biopsy'])
_ = xai.correlations(df, include_categorical=True, plot_type="matrix", plt_kwargs={'figsize': (6, 6)})
_ = xai.correlations(df, include_categorical=True, plt_kwargs={'figsize': (8, 6)})
In the cell below the target variable is selected. The Biopsy serves as the gold standard for diagnosing cervical cancer, therefore we will use it as target.
# Split into features (df_X) and target (df_y); 'Biopsy' is the gold standard
# for diagnosing cervical cancer and therefore serves as the target.
df_X, df_y, msg = split_feature_target(df, "Biopsy")
df_y
21-Oct-21 14:41:29 - Target 'Biopsy' selected successfully.
0 0
1 0
2 0
3 0
4 0
..
853 0
854 0
855 0
856 0
857 0
Name: Biopsy, Length: 858, dtype: int64
Four models are going to be trained on this dataset. In the output below we can see accuracy, classification reports, confusion matrix and ROC Curve for each model.
# Create empty models
# Prepare four untrained model shells for the feature/target frames, then fill
# the first one as a logistic regression trained on a normal train/test split.
initial_models, msg = fill_empty_models(df_X, df_y, 4)
models = []
model1 = initial_models[0]
msg = fill_model(model1, Algorithm.LOGISTIC_REGRESSION, Split(SplitTypes.NORMAL))
models.append(model1)
model_1 = models[0]  # alias used by the explanation cells below
21-Oct-21 14:41:29 - Model accuracy: 0.7325581395348837
21-Oct-21 14:41:29 - Classification report:
precision recall f1-score support
0 0.95 0.75 0.84 241
1 0.12 0.47 0.19 17
accuracy 0.73 258
macro avg 0.54 0.61 0.51 258
weighted avg 0.90 0.73 0.80 258
21-Oct-21 14:41:30 - Model Model 1 trained successfully!
# Train the second model: a decision tree on a normal train/test split.
model2 = initial_models[1]
msg = fill_model(model2, Algorithm.DECISION_TREE, Split(SplitTypes.NORMAL))
models.append(model2)
model_2 = models[1]  # alias used by the explanation cells below
21-Oct-21 14:41:30 - Model accuracy: 0.8643410852713178
21-Oct-21 14:41:30 - Classification report:
precision recall f1-score support
0 0.94 0.92 0.93 241
1 0.09 0.12 0.10 17
accuracy 0.86 258
macro avg 0.51 0.52 0.51 258
weighted avg 0.88 0.86 0.87 258
21-Oct-21 14:41:30 - Model Model 2 trained successfully!
# Train the third model: a random forest on a normal train/test split.
model3 = initial_models[2]
msg = fill_model(model3, Algorithm.RANDOM_FOREST, Split(SplitTypes.NORMAL))
models.append(model3)
model_3 = models[2]  # alias used by the explanation cells below
21-Oct-21 14:41:31 - Model accuracy: 0.9147286821705426
21-Oct-21 14:41:31 - Classification report:
precision recall f1-score support
0 0.93 0.98 0.96 241
1 0.00 0.00 0.00 17
accuracy 0.91 258
macro avg 0.47 0.49 0.48 258
weighted avg 0.87 0.91 0.89 258
21-Oct-21 14:41:31 - Model Model 3 trained successfully!
# Train the fourth model: a support-vector classifier on a normal train/test split.
model4 = initial_models[3]
msg = fill_model(model4, Algorithm.SVC, Split(SplitTypes.NORMAL))
models.append(model4)
model_4 = models[3]  # alias used by the explanation cells below
21-Oct-21 14:41:32 - Model accuracy: 0.8565891472868217
21-Oct-21 14:41:32 - Classification report:
precision recall f1-score support
0 0.94 0.90 0.92 241
1 0.14 0.24 0.18 17
accuracy 0.86 258
macro avg 0.54 0.57 0.55 258
weighted avg 0.89 0.86 0.87 258
21-Oct-21 14:41:33 - Model Model 4 trained successfully!
In the following steps we will use global interpretation techniques that help us to answer questions like how does a model behave in general? What features drive predictions and what features are completely useless. This data may be very important in understanding the model better. Most of the techniques work by investigating the conditional interactions between the target variable and the features on the complete dataset.
The importance of a feature is the increase in the prediction error of the model after we permuted the feature’s values, which breaks the relationship between the feature and the true outcome. A feature is “important” if permuting it increases the model error. This is because in that case, the model relied heavily on this feature for making right prediction. On the other hand, a feature is “unimportant” if permuting it doesn’t affect the error by much or doesn’t change it at all.
In the first case, we use ELI5, which does not permute the features but only visualizes the weight of each feature.
# ELI5 weight-based feature-importance view for the logistic-regression model.
plot = generate_feature_importance_plot(
    FeatureImportanceType.ELI5,
    model_1,
)
display(plot)
21-Oct-21 14:41:33 - Generating a feature importance plot using ELI5 for Model 1 ...
y=1 top features
| Weight? | Feature |
|---|---|
| +1.278 | STDs: Number of diagnosis |
| +0.481 | STDs_1.0 |
| +0.460 | IUD_1.0 |
| +0.454 | Smokes_0.0 |
| +0.389 | STDs: Time since last diagnosis |
| +0.110 | Hormonal Contraceptives_1.0 |
| +0.055 | Smokes (years) |
| +0.036 | Hormonal Contraceptives (years) |
| +0.013 | Age |
| +0.008 | First sexual intercourse |
| -0.014 | Number of sexual partners |
| -0.023 | IUD (years) |
| -0.031 | <BIAS> |
| -0.095 | Num of pregnancies |
| -0.141 | Hormonal Contraceptives_0.0 |
| -0.422 | STDs: Time since first diagnosis |
| -0.485 | Smokes_1.0 |
| -0.491 | IUD_0.0 |
| -0.512 | STDs_0.0 |
| -0.573 | STDs (number) |
# ELI5 weight-based feature-importance view for the decision-tree model.
plot = generate_feature_importance_plot(
    FeatureImportanceType.ELI5,
    model_2,
)
display(plot)
21-Oct-21 14:41:33 - Generating a feature importance plot using ELI5 for Model 2 ...
| Weight | Feature |
|---|---|
| 0.2598 | Age |
| 0.2039 | Hormonal Contraceptives (years) |
| 0.1629 | First sexual intercourse |
| 0.1375 | Num of pregnancies |
| 0.1082 | Number of sexual partners |
| 0.0517 | STDs: Time since last diagnosis |
| 0.0256 | IUD (years) |
| 0.0187 | Hormonal Contraceptives_1.0 |
| 0.0119 | Smokes_0.0 |
| 0.0098 | Smokes (years) |
| 0.0063 | Smokes_1.0 |
| 0.0031 | STDs (number) |
| 0.0005 | Hormonal Contraceptives_0.0 |
| 0.0000 | IUD_1.0 |
| 0.0000 | IUD_0.0 |
| 0 | STDs_1.0 |
| 0 | STDs: Number of diagnosis |
| 0 | STDs_0.0 |
| 0 | STDs: Time since first diagnosis |
# ELI5 weight-based feature-importance view for the random-forest model.
plot = generate_feature_importance_plot(
    FeatureImportanceType.ELI5,
    model_3,
)
display(plot)
21-Oct-21 14:41:33 - Generating a feature importance plot using ELI5 for Model 3 ...
| Weight | Feature |
|---|---|
| 0.2155 ± 0.1369 | Age |
| 0.1543 ± 0.1293 | Number of sexual partners |
| 0.1402 ± 0.0677 | Hormonal Contraceptives (years) |
| 0.1222 ± 0.1038 | First sexual intercourse |
| 0.1217 ± 0.0771 | Num of pregnancies |
| 0.0652 ± 0.0734 | IUD (years) |
| 0.0322 ± 0.0337 | Smokes (years) |
| 0.0236 ± 0.0584 | STDs: Time since first diagnosis |
| 0.0194 ± 0.0274 | Hormonal Contraceptives_0.0 |
| 0.0191 ± 0.0284 | Smokes_0.0 |
| 0.0185 ± 0.0234 | Hormonal Contraceptives_1.0 |
| 0.0176 ± 0.0388 | STDs (number) |
| 0.0165 ± 0.0336 | Smokes_1.0 |
| 0.0134 ± 0.0228 | STDs: Time since last diagnosis |
| 0.0097 ± 0.0120 | STDs: Number of diagnosis |
| 0.0078 ± 0.0224 | IUD_1.0 |
| 0.0025 ± 0.0062 | IUD_0.0 |
| 0.0006 ± 0.0021 | STDs_0.0 |
| 0 ± 0.0000 | STDs_1.0 |
# ELI5 view for the SVC model; ELI5 does not support SVC here, so the helper
# logs a warning and this displays None (see the cell output).
plot = generate_feature_importance_plot(FeatureImportanceType.ELI5, model_4)
display(plot)
21-Oct-21 14:41:33 - Generating a feature importance plot using ELI5 for Model 4 ... 21-Oct-21 14:41:33 - SVC not is supported by FeatureImportanceType.ELI5.
None
print(generate_feature_importance_explanation(FeatureImportanceType.ELI5, models, 4))
21-Oct-21 14:41:33 - Generating feature importance explanation for ELI5 ... 21-Oct-21 14:41:33 - SVC not supported for ELI5 explanations.
Summary: The highest feature for Model 1 is STDs: Number of diagnosis with weight ~1.278. The 2nd most valuable feature for Model 1 is STDs_1.0 with weight ~0.481. The 3rd best feature for Model 1 is IUD_1.0 with weight ~0.46. The 4th best feature for Model 1 is Smokes_0.0 with weight ~0.454. The highest feature for Model 2 is Age with weight ~0.26. The 2nd best feature for Model 2 is Hormonal Contraceptives (years) with weight ~0.204. The 3rd most influential feature for Model 2 is First sexual intercourse with weight ~0.163. The 4th most important feature for Model 2 is Num of pregnancies with weight ~0.137. The best feature for Model 3 is Age with weight ~0.215, matching 1st for Model 2. The 2nd most important feature for Model 3 is Number of sexual partners with weight ~0.154. The 3rd highest feature for Model 3 is Hormonal Contraceptives (years) with weight ~0.14, matching 2nd for Model 2. The 4th most influential feature for Model 3 is First sexual intercourse with weight ~0.122, identical to 3rd for Model 2.
# Skater feature-importance plot for Model 1; the figure is enlarged to fit
# all 19 (one-hot expanded) features.
%matplotlib inline
plt.rcParams['figure.figsize'] = [14, 15]
plt.style.use('ggplot')
warnings.filterwarnings('ignore')
_ = generate_feature_importance_plot(FeatureImportanceType.SKATER, model_1)
21-Oct-21 14:41:33 - Generating a feature importance plot using SKATER for Model 1 ... 21-Oct-21 14:41:34 - Initializing Skater - generating new in-memory model. This operation may be time-consuming so please be patient. 2021-10-21 14:41:34,410 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progress_bar=False
[19/19] features ████████████████████ Time elapsed: 1 seconds
_ = generate_feature_importance_plot(FeatureImportanceType.SKATER, model_2)
21-Oct-21 14:41:36 - Generating a feature importance plot using SKATER for Model 2 ... 21-Oct-21 14:41:36 - Initializing Skater - generating new in-memory model. This operation may be time-consuming so please be patient. 2021-10-21 14:41:37,037 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progress_bar=False
[19/19] features ████████████████████ Time elapsed: 1 seconds
_ = generate_feature_importance_plot(FeatureImportanceType.SKATER, model_3)
21-Oct-21 14:41:38 - Generating a feature importance plot using SKATER for Model 3 ... 21-Oct-21 14:41:38 - Initializing Skater - generating new in-memory model. This operation may be time-consuming so please be patient. 2021-10-21 14:41:39,056 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progress_bar=False
[19/19] features ████████████████████ Time elapsed: 1 seconds
_ = generate_feature_importance_plot(FeatureImportanceType.SKATER, model_4)
21-Oct-21 14:41:41 - Generating a feature importance plot using SKATER for Model 4 ... 21-Oct-21 14:41:41 - Initializing Skater - generating new in-memory model. This operation may be time-consuming so please be patient. 2021-10-21 14:41:41,421 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progress_bar=False
[19/19] features ████████████████████ Time elapsed: 1 seconds
print('\n' + generate_feature_importance_explanation(FeatureImportanceType.SKATER, models, 4))
21-Oct-21 14:41:43 - Generating feature importance explanation for SKATER ... 2021-10-21 14:41:43,925 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progress_bar=False
[19/19] features ████████████████████ Time elapsed: 1 seconds
2021-10-21 14:41:45,828 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progress_bar=False
[19/19] features ████████████████████ Time elapsed: 1 seconds
2021-10-21 14:41:47,009 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progress_bar=False
[19/19] features ████████████████████ Time elapsed: 1 seconds
2021-10-21 14:41:48,404 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progress_bar=False
[19/19] features ████████████████████ Time elapsed: 1 seconds Summary: The highest feature for Model 1 is STDs: Time since last diagnosis with weight ~0.113. The 2nd highest feature for Model 1 is STDs: Number of diagnosis with weight ~0.104. The 3rd highest feature for Model 1 is STDs: Time since first diagnosis with weight ~0.086. The 4th most important feature for Model 1 is Num of pregnancies with weight ~0.073. The best feature for Model 2 is Num of pregnancies with weight ~0.211, same as 4th for Model 1. The 2nd most valuable feature for Model 2 is Age with weight ~0.179. The 3rd best feature for Model 2 is Hormonal Contraceptives (years) with weight ~0.172. The 4th most important feature for Model 2 is First sexual intercourse with weight ~0.157. The highest feature for Model 3 is Age with weight ~0.179, identical to 2nd for Model 2. The 2nd most influential feature for Model 3 is Num of pregnancies with weight ~0.155, same as 4th for Model 1. The 3rd best feature for Model 3 is Number of sexual partners with weight ~0.131. The 4th most influential feature for Model 3 is Hormonal Contraceptives (years) with weight ~0.117, similar to 3rd for Model 2. The best feature for Model 4 is Age with weight ~0.293, matching 2nd for Model 2. The 2nd highest feature for Model 4 is Hormonal Contraceptives (years) with weight ~0.139, matching 3rd for Model 2. The 3rd most important feature for Model 4 is First sexual intercourse with weight ~0.122, same as 4th for Model 2. The 4th highest feature for Model 4 is IUD (years) with weight ~0.084.
In the cell below we use the SHAP (SHapley Additive exPlanations). It uses a combination of feature contributions and game theory to come up with SHAP values. Then, it computes the global feature importance by taking the average of the SHAP value magnitudes across the dataset.
# SHAP global feature importance; initjs() injects the JavaScript SHAP needs
# for its interactive plots in the notebook.
from shap import initjs
initjs()
%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')
generate_feature_importance_plot(FeatureImportanceType.SHAP, model_1)
21-Oct-21 14:41:50 - Generating a feature importance plot using SHAP for Model 1 ... 21-Oct-21 14:41:50 - Initializing Shap - calculating shap values. This operation is time-consuming so please be patient.
generate_feature_importance_plot(FeatureImportanceType.SHAP, model_2)
21-Oct-21 14:42:01 - Generating a feature importance plot using SHAP for Model 2 ... 21-Oct-21 14:42:01 - Initializing Shap - calculating shap values. This operation is time-consuming so please be patient.
generate_feature_importance_plot(FeatureImportanceType.SHAP, model_3)
21-Oct-21 14:42:13 - Generating a feature importance plot using SHAP for Model 3 ... 21-Oct-21 14:42:13 - Initializing Shap - calculating shap values. This operation is time-consuming so please be patient.
generate_feature_importance_plot(FeatureImportanceType.SHAP, model_4)
21-Oct-21 14:42:30 - Generating a feature importance plot using SHAP for Model 4 ... 21-Oct-21 14:42:30 - Initializing Shap - calculating shap values. This operation is time-consuming so please be patient.
print(generate_feature_importance_explanation(FeatureImportanceType.SHAP, models, 4))
21-Oct-21 14:43:02 - Generating feature importance explanation for SHAP ...
Summary: The best feature for Model 1 is Hormonal Contraceptives (years) with weight ~0.05. The 2nd highest feature for Model 1 is Num of pregnancies with weight ~0.049. The 3rd most important feature for Model 1 is Smokes (years) with weight ~0.046. The 4th most valuable feature for Model 1 is Age with weight ~0.042. The highest feature for Model 2 is Hormonal Contraceptives (years) with weight ~0.295, alike 1st for Model 1. The 2nd best feature for Model 2 is Number of sexual partners with weight ~0.153. The 3rd best feature for Model 2 is First sexual intercourse with weight ~0.125. The 4th most important feature for Model 2 is Age with weight ~0.123, similar to 4th for Model 1. The highest feature for Model 3 is Hormonal Contraceptives (years) with weight ~0.116, identical to 1st for Model 1. The 2nd most influential feature for Model 3 is Num of pregnancies with weight ~0.077, same as 2nd for Model 1. The 3rd most influential feature for Model 3 is Age with weight ~0.073, similar to 4th for Model 1. The 4th most valuable feature for Model 3 is Number of sexual partners with weight ~0.049, same as 2nd for Model 2. The most valuable feature for Model 4 is Age with weight ~0.001, similar to 4th for Model 1. The 2nd most valuable feature for Model 4 is Hormonal Contraceptives (years) with weight ~0.0, alike 1st for Model 1. The 3rd most valuable feature for Model 4 is IUD (years) with weight ~0.0. The 4th best feature for Model 4 is First sexual intercourse with weight ~0.0, matching 3rd for Model 2.
The partial dependence plot (short PDP or PD plot) shows the marginal effect one or two features have on the predicted outcome of a machine learning model. A partial dependence plot can show whether the relationship between the target and a feature is linear, monotonic or more complex. For example, when applied to a linear regression model, partial dependence plots always show a linear relationship.
PDPBox is the first module that we use for plotting partial dependence.
# PDPBox partial dependence for Model 1: 'Age' alone, then the
# Age x 'Number of sexual partners' interaction.
# NOTE(review): the "None" second feature presumably selects the
# single-feature plot — confirm against generate_pdp_plots in util.
generate_pdp_plots(PDPType.PDPBox, model_1, "Age", "None")
generate_pdp_plots(PDPType.PDPBox, model_1, "Age", "Number of sexual partners")
21-Oct-21 14:43:02 - Generating a PDP plot using PDPBox for Model 1 ... 21-Oct-21 14:43:03 - Generating a PDP plot using PDPBox for Model 1 ...
# PDPBox partial dependence for Model 2: 'Age' alone, then the interaction.
generate_pdp_plots(PDPType.PDPBox, model_2, "Age", "None")
generate_pdp_plots(PDPType.PDPBox, model_2, "Age", "Number of sexual partners")
21-Oct-21 14:43:05 - Generating a PDP plot using PDPBox for Model 2 ... 21-Oct-21 14:43:06 - Generating a PDP plot using PDPBox for Model 2 ...
# PDPBox partial dependence for Model 3: 'Age' alone, then the interaction.
generate_pdp_plots(PDPType.PDPBox, model_3, "Age", "None")
generate_pdp_plots(PDPType.PDPBox, model_3, "Age", "Number of sexual partners")
21-Oct-21 14:43:08 - Generating a PDP plot using PDPBox for Model 3 ... 21-Oct-21 14:43:09 - Generating a PDP plot using PDPBox for Model 3 ...
# PDPBox partial dependence for Model 4: 'Age' alone, then the interaction.
generate_pdp_plots(PDPType.PDPBox, model_4, "Age", "None")
generate_pdp_plots(PDPType.PDPBox, model_4, "Age", "Number of sexual partners")
21-Oct-21 14:43:12 - Generating a PDP plot using PDPBox for Model 4 ... 21-Oct-21 14:43:13 - Generating a PDP plot using PDPBox for Model 4 ...
generate_pdp_plots(PDPType.SKATER, model_1, "Age", "Number of sexual partners")
21-Oct-21 14:43:19 - Generating a PDP plot using SKATER for Model 1 ... 2021-10-21 14:43:19,804 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progressbar=False
[468/468] grid cells ████████████████████ Time elapsed: 77 seconds
generate_pdp_plots(PDPType.SKATER, model_2, "Age", "Number of sexual partners")
21-Oct-21 14:44:38 - Generating a PDP plot using SKATER for Model 2 ... 2021-10-21 14:44:38,777 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progressbar=False
[468/468] grid cells ████████████████████ Time elapsed: 44 seconds
generate_pdp_plots(PDPType.SKATER, model_3, "Age", "Number of sexual partners")
21-Oct-21 14:45:24 - Generating a PDP plot using SKATER for Model 3 ... 2021-10-21 14:45:24,716 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progressbar=False
[468/468] grid cells ████████████████████ Time elapsed: 45 seconds
generate_pdp_plots(PDPType.SKATER, model_4, "Age", "Number of sexual partners")
21-Oct-21 14:46:11 - Generating a PDP plot using SKATER for Model 4 ... 2021-10-21 14:46:12,003 - skater.core.explanations - WARNING - Progress bars slow down runs by 10-20%. For slightly faster runs, do progressbar=False
[468/468] grid cells ████████████████████ Time elapsed: 65 seconds
generate_pdp_plots(PDPType.SHAP, model_1, "Age", "Number of sexual partners")
21-Oct-21 14:47:18 - Generating a PDP plot using SHAP for Model 1 ...
generate_pdp_plots(PDPType.SHAP, model_2, "Age", "Number of sexual partners")
21-Oct-21 14:47:18 - Generating a PDP plot using SHAP for Model 2 ...
generate_pdp_plots(PDPType.SHAP, model_3, "Age", "Number of sexual partners")
21-Oct-21 14:47:19 - Generating a PDP plot using SHAP for Model 3 ...
generate_pdp_plots(PDPType.SHAP, model_4, "Age", "Number of sexual partners")
21-Oct-21 14:47:19 - Generating a PDP plot using SHAP for Model 4 ...
Local interpretation focuses on specifics of each individual and provides explanations that can lead to a better understanding of the feature contribution in smaller groups of individuals that are often overlooked by the global interpretation techniques. We will use two modules for interpreting single instances - SHAP and LIME.
SHAP leverages the idea of Shapley values for model feature influence scoring. The technical definition of a Shapley value is the “average marginal contribution of a feature value over all possible coalitions.” In other words, Shapley values consider all possible predictions for an instance using all possible combinations of inputs. Because of this exhaustive approach, SHAP can guarantee properties like consistency and local accuracy. LIME, on the other hand, does not offer such guarantees.
LIME (Local Interpretable Model-agnostic Explanations) builds sparse linear models around each prediction to explain how the black box model works in that local vicinity. While treating the model as a black box, we perturb the instance we want to explain and learn a sparse linear model around it, as an explanation. LIME has the advantage over SHAP, that it is a lot faster.
# Select one falsely-classified test example per model for the local
# explanations below, re-sampling until the example is not already in the
# list so the four chosen examples are distinct.
# NOTE(review): the while-loop assumes get_test_examples can eventually
# return an unseen example per model — otherwise it would spin forever.
examples = []
example_types = [ExampleType.FALSELY_CLASSIFIED]
for example_type in example_types:
    for model in models:
        example = get_test_examples(model, example_type, 1)[0]
        while example in examples:
            example = get_test_examples(model, example_type, 1)[0]
        examples.append(example)
display(examples)
[126, 159, 75, 102]
# Inspect the first selected example (misclassified by Model 1, per the
# output below) and compare how every model classified it.
example = examples[0]
print(get_example_information(model_1, example))
print(generate_single_instance_comparison(models, example))
Example 126's data: Age 27 Number of sexual partners 2.0 First sexual intercourse 18.0 Num of pregnancies 0.0 Smokes 0.0 Smokes (years) 0.0 Hormonal Contraceptives 0.0 Hormonal Contraceptives (years) 0.0 IUD 1.0 IUD (years) 2.0 STDs 0.0 STDs (number) 0.0 STDs: Number of diagnosis 0 STDs: Time since first diagnosis 0.0 STDs: Time since last diagnosis 0.0 Name: 167, dtype: object Actual result for example 126: 0 Example 126 was truly classified by Model 2, Model 3, Model 4 and falsely classified by Model 1. For further clarification see the explanations below.
# Local explanations of this example for Model 1: LIME first, then SHAP.
explanation = explain_single_instance(LocalInterpreterType.LIME, model_1, example)
print(generate_single_instance_explanation(LocalInterpreterType.LIME, model_1, example))
explanation.show_in_notebook(show_table=True, show_all=True)
explanation = explain_single_instance(LocalInterpreterType.SHAP, model_1, example)
print(generate_single_instance_explanation(LocalInterpreterType.SHAP, model_1, example))
display(explanation)
21-Oct-21 14:47:20 - Generating a single instance explanation using LIME for Model 1 ... 21-Oct-21 14:47:20 - Initializing LIME - generating new explainer. This operation may be time-consuming so please be patient. 21-Oct-21 14:47:26 - Generating a single instance explanation using LIME for Model 1 ...
The prediction probability of Model 1's decision for this example is 0.66. LIME's explanation: The feature that mainly changes Model 1's positive (1) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of 0.3302. The feature with the second most considerable change on Model 1's positive (1) prediction probability is IUD=1.0 with value of 0.1973. The third most influential feature for the positive (1) prediction probability of Model 1 is STDs (number) <= 0.00 with value of 0.1958 The feature that largely impacts Model 1's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.324. The feature with the second most substantial change on Model 1's negative (0) prediction probability is STDs: Number of diagnosis <= 0.00 with value of -0.2472.
21-Oct-21 14:47:33 - Generating a single instance explanation using SHAP for Model 1 ...
The prediction probability of Model 1's decision for this example is 0.66. SHAP's explanation: The feature that primarily changes Model 1's positive (1) prediction probability is IUD_0.0 with value of 0.1182. The feature with the second most substantial affect on Model 1's positive (1) prediction probability is IUD_1.0 with value of 0.1106. The third most impactful feature for the positive (1) prediction probability of Model 1 is Num of pregnancies with value of 0.0453 The feature that mainly affects Model 1's negative (0) prediction probability is Hormonal Contraceptives_0.0 with value of -0.0334. The feature with the second biggest influence on Model 1's negative (0) prediction probability is Hormonal Contraceptives_1.0 with value of -0.0261.
# Local explanations of the same example for Model 2: LIME first, then SHAP.
explanation = explain_single_instance(LocalInterpreterType.LIME, model_2, example)
print(generate_single_instance_explanation(LocalInterpreterType.LIME, model_2, example))
explanation.show_in_notebook(show_table=True, show_all=True)
explanation = explain_single_instance(LocalInterpreterType.SHAP, model_2, example)
print(generate_single_instance_explanation(LocalInterpreterType.SHAP, model_2, example))
display(explanation)
21-Oct-21 14:47:33 - Generating a single instance explanation using LIME for Model 2 ... 21-Oct-21 14:47:33 - Initializing LIME - generating new explainer. This operation may be time-consuming so please be patient. 21-Oct-21 14:47:40 - Generating a single instance explanation using LIME for Model 2 ...
The prediction probability of Model 2's decision for this example is 1.0. LIME's explanation: The feature that primarily impacts Model 2's positive (1) prediction probability is 17.00 < First sexual intercourse <= 18.00 with value of 0.0809. The feature with the second largest change on Model 2's positive (1) prediction probability is Hormonal Contraceptives (years) <= 0.00 with value of 0.0654. The third most influential feature for the positive (1) prediction probability of Model 2 is IUD=1.0 with value of 0.015 The feature that mostly affects Model 2's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.141. The feature with the second most substantial change on Model 2's negative (0) prediction probability is Num of pregnancies <= 1.00 with value of -0.0381.
21-Oct-21 14:47:47 - Generating a single instance explanation using SHAP for Model 2 ...
The prediction probability of Model 2's decision for this example is 1.0. SHAP's explanation: The feature that mostly influences Model 2's positive (1) prediction probability is Hormonal Contraceptives_1.0 with value of 0.1667. The feature with the second most considerable influence on Model 2's positive (1) prediction probability is Num of pregnancies with value of 0.1667. The feature that largely affects Model 2's negative (0) prediction probability is Hormonal Contraceptives (years) with value of -0.3333.
# Explain the current example with Model 3 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_3, example)
    print(generate_single_instance_explanation(interpreter, model_3, example))
    render(explanation)
21-Oct-21 14:47:47 - Generating a single instance explanation using LIME for Model 3 ... 21-Oct-21 14:47:47 - Initializing LIME - generating new explainer. This operation may be time-consuming so please be patient. 21-Oct-21 14:47:53 - Generating a single instance explanation using LIME for Model 3 ...
The prediction probability of Model 3's decision for this example is 1.0. LIME's explanation: The feature that mainly influences Model 3's positive (1) prediction probability is IUD (years) > 0.00 with value of 0.0253. The feature with the second largest influence on Model 3's positive (1) prediction probability is STDs (number) <= 0.00 with value of 0.0211. The third most influential feature for the positive (1) prediction probability of Model 3 is Smokes (years) <= 0.00 with value of 0.02 The feature that mainly impacts Model 3's negative (0) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of -0.0654. The feature with the second largest influence on Model 3's negative (0) prediction probability is Hormonal Contraceptives=0.0 with value of -0.0275.
21-Oct-21 14:48:00 - Generating a single instance explanation using SHAP for Model 3 ...
The prediction probability of Model 3's decision for this example is 1.0. SHAP's explanation: The feature that largely affects Model 3's positive (1) prediction probability is Number of sexual partners with value of 0.0233. The feature with the second biggest influence on Model 3's positive (1) prediction probability is First sexual intercourse with value of 0.0233. The third most important feature for the positive (1) prediction probability of Model 3 is IUD (years) with value of 0.0233 The feature that mostly influences Model 3's negative (0) prediction probability is Hormonal Contraceptives (years) with value of -0.05. The feature with the second most substantial impact on Model 3's negative (0) prediction probability is Hormonal Contraceptives_1.0 with value of -0.01.
# Explain the current example with Model 4 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_4, example)
    print(generate_single_instance_explanation(interpreter, model_4, example))
    render(explanation)
21-Oct-21 14:48:00 - Generating a single instance explanation using LIME for Model 4 ... 21-Oct-21 14:48:00 - Initializing LIME - generating new explainer. This operation may be time-consuming so please be patient. 21-Oct-21 14:48:07 - Generating a single instance explanation using LIME for Model 4 ...
The prediction probability of Model 4's decision for this example is 0.94. LIME's explanation: The feature that primarily changes Model 4's positive (1) prediction probability is IUD (years) > 0.00 with value of 0.0009. The feature with the second largest impact on Model 4's positive (1) prediction probability is IUD=1.0 with value of 0.0002. The third most effective feature for the positive (1) prediction probability of Model 4 is 17.00 < First sexual intercourse <= 18.00 with value of 0.0001 The feature that primarily impacts Model 4's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.0008. The feature with the second most considerable influence on Model 4's negative (0) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of -0.0006.
21-Oct-21 14:48:15 - Generating a single instance explanation using SHAP for Model 4 ...
The prediction probability of Model 4's decision for this example is 0.94. SHAP's explanation: The feature that mostly impacts Model 4's positive (1) prediction probability is Hormonal Contraceptives (years) with value of 0.0001. The feature with the second largest effect on Model 4's positive (1) prediction probability is Hormonal Contraceptives_0.0 with value of 0.0. The third most impactful feature for the positive (1) prediction probability of Model 4 is Hormonal Contraceptives_1.0 with value of 0.0. The feature that primarily affects Model 4's negative (0) prediction probability is IUD (years) with value of -0.0002. The feature with the second most substantial change on Model 4's negative (0) prediction probability is IUD_1.0 with value of -0.0001.
# Move on to the second sampled example: print its raw feature values and
# actual label, then a cross-model summary of which models got it right.
example = examples[1]
example_report = get_example_information(model_1, example)
print(example_report)
comparison_report = generate_single_instance_comparison(models, example)
print(comparison_report)
Example 159's data: Age 35 Number of sexual partners 3.0 First sexual intercourse 20.0 Num of pregnancies 2.0 Smokes 0.0 Smokes (years) 0.0 Hormonal Contraceptives 0.0 Hormonal Contraceptives (years) 0.0 IUD 1.0 IUD (years) 10.0 STDs 1.0 STDs (number) 2.0 STDs: Number of diagnosis 1 STDs: Time since first diagnosis 3.0 STDs: Time since last diagnosis 3.0 Name: 68, dtype: object Actual result for example 159: 0 Example 159 was truly classified by Model 3 and falsely classified by Model 1, Model 2, Model 4. For further clarification see the explanations below.
# Explain the current example with Model 1 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_1, example)
    print(generate_single_instance_explanation(interpreter, model_1, example))
    render(explanation)
21-Oct-21 14:48:15 - Generating a single instance explanation using LIME for Model 1 ... 21-Oct-21 14:48:22 - Generating a single instance explanation using LIME for Model 1 ...
The prediction probability of Model 1's decision for this example is 0.8. LIME's explanation: The feature that mostly changes Model 1's positive (1) prediction probability is STDs: Time since last diagnosis > 0.00 with value of 0.3175. The feature with the second most substantial affect on Model 1's positive (1) prediction probability is STDs: Number of diagnosis > 0.00 with value of 0.2577. The third most effective feature for the positive (1) prediction probability of Model 1 is STDs=1.0 with value of 0.2074 The feature that mainly affects Model 1's negative (0) prediction probability is STDs: Time since first diagnosis > 0.00 with value of -0.3276. The feature with the second most substantial change on Model 1's negative (0) prediction probability is STDs (number) > 0.00 with value of -0.1963.
21-Oct-21 14:48:29 - Generating a single instance explanation using SHAP for Model 1 ...
The prediction probability of Model 1's decision for this example is 0.8. SHAP's explanation: The feature that mainly impacts Model 1's positive (1) prediction probability is STDs: Number of diagnosis with value of 0.2485. The feature with the second most substantial influence on Model 1's positive (1) prediction probability is STDs: Time since last diagnosis with value of 0.2257. The third most impactful feature for the positive (1) prediction probability of Model 1 is STDs_0.0 with value of 0.0986 The feature that mainly influences Model 1's negative (0) prediction probability is STDs: Time since first diagnosis with value of -0.2238. The feature with the second most considerable change on Model 1's negative (0) prediction probability is STDs (number) with value of -0.2043.
# Explain the current example with Model 2 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_2, example)
    print(generate_single_instance_explanation(interpreter, model_2, example))
    render(explanation)
21-Oct-21 14:48:29 - Generating a single instance explanation using LIME for Model 2 ... 21-Oct-21 14:48:35 - Generating a single instance explanation using LIME for Model 2 ...
The prediction probability of Model 2's decision for this example is 1.0. LIME's explanation: The feature that primarily influences Model 2's positive (1) prediction probability is STDs: Time since last diagnosis > 0.00 with value of 0.1096. The feature with the second biggest influence on Model 2's positive (1) prediction probability is Hormonal Contraceptives (years) <= 0.00 with value of 0.0523. The third most impactful feature for the positive (1) prediction probability of Model 2 is Smokes (years) <= 0.00 with value of 0.0182 The feature that mainly changes Model 2's negative (0) prediction probability is First sexual intercourse > 18.00 with value of -0.0654. The feature with the second biggest influence on Model 2's negative (0) prediction probability is Hormonal Contraceptives=0.0 with value of -0.0404.
21-Oct-21 14:48:42 - Generating a single instance explanation using SHAP for Model 2 ...
The prediction probability of Model 2's decision for this example is 1.0. SHAP's explanation: The feature that mostly impacts Model 2's positive (1) prediction probability is Hormonal Contraceptives (years) with value of 0.7111. The feature with the second most considerable change on Model 2's positive (1) prediction probability is STDs: Time since last diagnosis with value of 0.4351. The feature that primarily changes Model 2's negative (0) prediction probability is IUD (years) with value of -0.0603. The feature with the second largest change on Model 2's negative (0) prediction probability is First sexual intercourse with value of -0.0562.
# Explain the current example with Model 3 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_3, example)
    print(generate_single_instance_explanation(interpreter, model_3, example))
    render(explanation)
21-Oct-21 14:48:42 - Generating a single instance explanation using LIME for Model 3 ... 21-Oct-21 14:48:49 - Generating a single instance explanation using LIME for Model 3 ...
The prediction probability of Model 3's decision for this example is 0.8. LIME's explanation: The feature that primarily changes Model 3's positive (1) prediction probability is STDs: Time since first diagnosis > 0.00 with value of 0.053. The feature with the second biggest affect on Model 3's positive (1) prediction probability is IUD (years) > 0.00 with value of 0.0188. The third most important feature for the positive (1) prediction probability of Model 3 is STDs: Number of diagnosis > 0.00 with value of 0.0179 The feature that mainly affects Model 3's negative (0) prediction probability is STDs (number) > 0.00 with value of -0.0277. The feature with the second largest change on Model 3's negative (0) prediction probability is Hormonal Contraceptives=0.0 with value of -0.0233.
21-Oct-21 14:48:55 - Generating a single instance explanation using SHAP for Model 3 ...
The prediction probability of Model 3's decision for this example is 0.8. SHAP's explanation: The feature that mostly affects Model 3's positive (1) prediction probability is IUD_1.0 with value of 0.0205. The feature with the second largest influence on Model 3's positive (1) prediction probability is Hormonal Contraceptives_1.0 with value of 0.0184. The third most important feature for the positive (1) prediction probability of Model 3 is IUD_0.0 with value of 0.018 The feature that mostly influences Model 3's negative (0) prediction probability is Hormonal Contraceptives (years) with value of -0.0911. The feature with the second most substantial affect on Model 3's negative (0) prediction probability is STDs: Number of diagnosis with value of -0.0515.
# Explain the current example with Model 4 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_4, example)
    print(generate_single_instance_explanation(interpreter, model_4, example))
    render(explanation)
21-Oct-21 14:48:55 - Generating a single instance explanation using LIME for Model 4 ... 21-Oct-21 14:49:03 - Generating a single instance explanation using LIME for Model 4 ...
The prediction probability of Model 4's decision for this example is 0.93. LIME's explanation: The feature that mainly affects Model 4's positive (1) prediction probability is Age > 31.75 with value of 0.0011. The feature with the second largest influence on Model 4's positive (1) prediction probability is IUD (years) > 0.00 with value of 0.0008. The third most important feature for the positive (1) prediction probability of Model 4 is STDs: Time since last diagnosis > 0.00 with value of 0.0007 The feature that mostly changes Model 4's negative (0) prediction probability is Smokes (years) <= 0.00 with value of -0.0004. The feature with the second most considerable influence on Model 4's negative (0) prediction probability is Hormonal Contraceptives (years) <= 0.00 with value of -0.0003.
21-Oct-21 14:49:11 - Generating a single instance explanation using SHAP for Model 4 ...
The prediction probability of Model 4's decision for this example is 0.93. SHAP's explanation: The feature that primarily impacts Model 4's negative (0) prediction probability is IUD (years) with value of -0.0015. The feature with the second biggest influence on Model 4's negative (0) prediction probability is Age with value of -0.0005.
# Move on to the third sampled example: print its raw feature values and
# actual label, then a cross-model summary of which models got it right.
example = examples[2]
example_report = get_example_information(model_1, example)
print(example_report)
comparison_report = generate_single_instance_comparison(models, example)
print(comparison_report)
Example 75's data: Age 51 Number of sexual partners 3.0 First sexual intercourse 17.0 Num of pregnancies 6.0 Smokes 1.0 Smokes (years) 34.0 Hormonal Contraceptives 0.0 Hormonal Contraceptives (years) 0.0 IUD 1.0 IUD (years) 7.0 STDs 0.0 STDs (number) 0.0 STDs: Number of diagnosis 0 STDs: Time since first diagnosis 0.0 STDs: Time since last diagnosis 0.0 Name: 6, dtype: object Actual result for example 75: 1 Example 75 was truly classified by Model 1 and falsely classified by Model 2, Model 3, Model 4. For further clarification see the explanations below.
# Explain the current example with Model 1 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_1, example)
    print(generate_single_instance_explanation(interpreter, model_1, example))
    render(explanation)
21-Oct-21 14:49:11 - Generating a single instance explanation using LIME for Model 1 ... 21-Oct-21 14:49:18 - Generating a single instance explanation using LIME for Model 1 ...
The prediction probability of Model 1's decision for this example is 0.77. LIME's explanation: The feature that largely impacts Model 1's positive (1) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of 0.3373. The feature with the second largest change on Model 1's positive (1) prediction probability is STDs (number) <= 0.00 with value of 0.1979. The third most impactful feature for the positive (1) prediction probability of Model 1 is IUD=1.0 with value of 0.1903 The feature that primarily changes Model 1's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.3269. The feature with the second most substantial influence on Model 1's negative (0) prediction probability is STDs: Number of diagnosis <= 0.00 with value of -0.2622.
21-Oct-21 14:49:24 - Generating a single instance explanation using SHAP for Model 1 ...
The prediction probability of Model 1's decision for this example is 0.77. SHAP's explanation: The feature that largely impacts Model 1's positive (1) prediction probability is Smokes (years) with value of 0.386. The feature with the second most considerable change on Model 1's positive (1) prediction probability is IUD_0.0 with value of 0.0989. The third most impactful feature for the positive (1) prediction probability of Model 1 is IUD_1.0 with value of 0.0926 The feature that mostly changes Model 1's negative (0) prediction probability is Smokes_1.0 with value of -0.0938. The feature with the second biggest change on Model 1's negative (0) prediction probability is Smokes_0.0 with value of -0.0879.
# Explain the current example with Model 2 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_2, example)
    print(generate_single_instance_explanation(interpreter, model_2, example))
    render(explanation)
21-Oct-21 14:49:24 - Generating a single instance explanation using LIME for Model 2 ... 21-Oct-21 14:49:31 - Generating a single instance explanation using LIME for Model 2 ...
The prediction probability of Model 2's decision for this example is 1.0. LIME's explanation: The feature that largely changes Model 2's positive (1) prediction probability is Hormonal Contraceptives (years) <= 0.00 with value of 0.0548. The feature with the second largest affect on Model 2's positive (1) prediction probability is Num of pregnancies > 3.00 with value of 0.0509. The third most impactful feature for the positive (1) prediction probability of Model 2 is 15.00 < First sexual intercourse <= 17.00 with value of 0.0271 The feature that mainly affects Model 2's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.1156. The feature with the second largest impact on Model 2's negative (0) prediction probability is Hormonal Contraceptives=0.0 with value of -0.0405.
21-Oct-21 14:49:38 - Generating a single instance explanation using SHAP for Model 2 ...
The prediction probability of Model 2's decision for this example is 1.0. SHAP's explanation: The feature that mostly impacts Model 2's positive (1) prediction probability is Age with value of 0.0833. The feature with the second largest affect on Model 2's positive (1) prediction probability is Hormonal Contraceptives_1.0 with value of 0.0833. The third most important feature for the positive (1) prediction probability of Model 2 is IUD (years) with value of 0.0833 The feature that largely affects Model 2's negative (0) prediction probability is Hormonal Contraceptives (years) with value of -0.25.
# Explain the current example with Model 3 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_3, example)
    print(generate_single_instance_explanation(interpreter, model_3, example))
    render(explanation)
21-Oct-21 14:49:38 - Generating a single instance explanation using LIME for Model 3 ... 21-Oct-21 14:49:45 - Generating a single instance explanation using LIME for Model 3 ...
The prediction probability of Model 3's decision for this example is 0.8. LIME's explanation: The feature that mainly changes Model 3's positive (1) prediction probability is IUD (years) > 0.00 with value of 0.0246. The feature with the second most considerable affect on Model 3's positive (1) prediction probability is STDs (number) <= 0.00 with value of 0.0222. The third most impactful feature for the positive (1) prediction probability of Model 3 is 2.00 < Number of sexual partners <= 3.00 with value of 0.0139 The feature that mainly influences Model 3's negative (0) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of -0.0731. The feature with the second largest change on Model 3's negative (0) prediction probability is Hormonal Contraceptives=0.0 with value of -0.0257.
21-Oct-21 14:49:52 - Generating a single instance explanation using SHAP for Model 3 ...
The prediction probability of Model 3's decision for this example is 0.8. SHAP's explanation: The feature that mostly changes Model 3's positive (1) prediction probability is Smokes_0.0 with value of 0.0367. The feature with the second largest affect on Model 3's positive (1) prediction probability is Hormonal Contraceptives_1.0 with value of 0.0067. The third most important feature for the positive (1) prediction probability of Model 3 is IUD_1.0 with value of 0.0067 The feature that primarily affects Model 3's negative (0) prediction probability is IUD (years) with value of -0.0633. The feature with the second most considerable affect on Model 3's negative (0) prediction probability is Age with value of -0.0567.
# Explain the current example with Model 4 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_4, example)
    print(generate_single_instance_explanation(interpreter, model_4, example))
    render(explanation)
21-Oct-21 14:49:52 - Generating a single instance explanation using LIME for Model 4 ... 21-Oct-21 14:49:59 - Generating a single instance explanation using LIME for Model 4 ...
The prediction probability of Model 4's decision for this example is 0.94. LIME's explanation: The feature that primarily affects Model 4's positive (1) prediction probability is Age > 31.75 with value of 0.0011. The feature with the second largest influence on Model 4's positive (1) prediction probability is IUD (years) > 0.00 with value of 0.0007. The third most impactful feature for the positive (1) prediction probability of Model 4 is Smokes (years) > 0.00 with value of 0.0004 The feature that largely changes Model 4's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.0007. The feature with the second most substantial change on Model 4's negative (0) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of -0.0006.
21-Oct-21 14:50:07 - Generating a single instance explanation using SHAP for Model 4 ...
The prediction probability of Model 4's decision for this example is 0.94. SHAP's explanation: The feature that largely impacts Model 4's positive (1) prediction probability is Hormonal Contraceptives (years) with value of 0.0002. The feature with the second largest change on Model 4's positive (1) prediction probability is Num of pregnancies with value of 0.0001. The third most important feature for the positive (1) prediction probability of Model 4 is Hormonal Contraceptives_1.0 with value of 0.0 The feature that primarily changes Model 4's negative (0) prediction probability is Smokes (years) with value of -0.0007. The feature with the second most considerable impact on Model 4's negative (0) prediction probability is IUD (years) with value of -0.0007.
# Move on to the fourth sampled example: print its raw feature values and
# actual label, then a cross-model summary of which models got it right.
example = examples[3]
example_report = get_example_information(model_1, example)
print(example_report)
comparison_report = generate_single_instance_comparison(models, example)
print(comparison_report)
Example 102's data: Age 43 Number of sexual partners 2.0 First sexual intercourse 18.0 Num of pregnancies 5.0 Smokes 0.0 Smokes (years) 0.0 Hormonal Contraceptives 0.0 Hormonal Contraceptives (years) 0.0 IUD 1.0 IUD (years) 8.0 STDs 0.0 STDs (number) 0.0 STDs: Number of diagnosis 0 STDs: Time since first diagnosis 0.0 STDs: Time since last diagnosis 0.0 Name: 14, dtype: object Actual result for example 102: 0 Example 102 was truly classified by no model and falsely classified by Model 1, Model 2, Model 3, Model 4. For further clarification see the explanations below.
# Explain the current example with Model 1 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_1, example)
    print(generate_single_instance_explanation(interpreter, model_1, example))
    render(explanation)
21-Oct-21 14:50:07 - Generating a single instance explanation using LIME for Model 1 ... 21-Oct-21 14:50:14 - Generating a single instance explanation using LIME for Model 1 ...
The prediction probability of Model 1's decision for this example is 0.56. LIME's explanation: The feature that primarily changes Model 1's positive (1) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of 0.3279. The feature with the second most considerable change on Model 1's positive (1) prediction probability is IUD=1.0 with value of 0.1965. The third most important feature for the positive (1) prediction probability of Model 1 is STDs (number) <= 0.00 with value of 0.1934 The feature that mainly influences Model 1's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.3209. The feature with the second most considerable change on Model 1's negative (0) prediction probability is STDs: Number of diagnosis <= 0.00 with value of -0.2641.
21-Oct-21 14:50:21 - Generating a single instance explanation using SHAP for Model 1 ...
The prediction probability of Model 1's decision for this example is 0.56. SHAP's explanation: The feature that primarily changes Model 1's positive (1) prediction probability is IUD_0.0 with value of 0.1198. The feature with the second most substantial effect on Model 1's positive (1) prediction probability is IUD_1.0 with value of 0.1122. The third most impactful feature for the positive (1) prediction probability of Model 1 is Age with value of 0.0511. The feature that primarily affects Model 1's negative (0) prediction probability is Num of pregnancies with value of -0.0687. The feature with the second most substantial effect on Model 1's negative (0) prediction probability is IUD (years) with value of -0.042.
# Explain the current example with Model 2 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_2, example)
    print(generate_single_instance_explanation(interpreter, model_2, example))
    render(explanation)
21-Oct-21 14:50:21 - Generating a single instance explanation using LIME for Model 2 ... 21-Oct-21 14:50:27 - Generating a single instance explanation using LIME for Model 2 ...
The prediction probability of Model 2's decision for this example is 1.0. LIME's explanation: The feature that largely changes Model 2's positive (1) prediction probability is Hormonal Contraceptives (years) <= 0.00 with value of 0.0664. The feature with the second most considerable affect on Model 2's positive (1) prediction probability is 17.00 < First sexual intercourse <= 18.00 with value of 0.0584. The third most influential feature for the positive (1) prediction probability of Model 2 is Num of pregnancies > 3.00 with value of 0.0288 The feature that mainly changes Model 2's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.0973. The feature with the second largest impact on Model 2's negative (0) prediction probability is Age > 31.75 with value of -0.0458.
21-Oct-21 14:50:34 - Generating a single instance explanation using SHAP for Model 2 ...
The prediction probability of Model 2's decision for this example is 1.0. SHAP's explanation: The feature that mostly impacts Model 2's positive (1) prediction probability is Hormonal Contraceptives (years) with value of 0.5. The feature with the second most substantial impact on Model 2's positive (1) prediction probability is Num of pregnancies with value of 0.25. The third most important feature for the positive (1) prediction probability of Model 2 is First sexual intercourse with value of 0.25 The feature that mainly affects Model 2's negative (0) prediction probability is Age with value of -0.0833. The feature with the second biggest impact on Model 2's negative (0) prediction probability is Hormonal Contraceptives_1.0 with value of -0.0833.
# Explain the current example with Model 3 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_3, example)
    print(generate_single_instance_explanation(interpreter, model_3, example))
    render(explanation)
21-Oct-21 14:50:34 - Generating a single instance explanation using LIME for Model 3 ... 21-Oct-21 14:50:41 - Generating a single instance explanation using LIME for Model 3 ...
The prediction probability of Model 3's decision for this example is 0.6. LIME's explanation: The feature that mainly influences Model 3's positive (1) prediction probability is IUD (years) > 0.00 with value of 0.0383. The feature with the second biggest impact on Model 3's positive (1) prediction probability is STDs (number) <= 0.00 with value of 0.0215. The third most effective feature for the positive (1) prediction probability of Model 3 is Smokes (years) <= 0.00 with value of 0.0105 The feature that mainly influences Model 3's negative (0) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of -0.0653. The feature with the second most substantial change on Model 3's negative (0) prediction probability is Hormonal Contraceptives=0.0 with value of -0.0324.
21-Oct-21 14:50:48 - Generating a single instance explanation using SHAP for Model 3 ...
The prediction probability of Model 3's decision for this example is 0.6. SHAP's explanation: The feature that primarily affects Model 3's positive (1) prediction probability is IUD (years) with value of 0.1367. The feature with the second most considerable influence on Model 3's positive (1) prediction probability is Age with value of 0.1348. The third most impactful feature for the positive (1) prediction probability of Model 3 is Hormonal Contraceptives (years) with value of 0.1114 The feature that primarily influences Model 3's negative (0) prediction probability is IUD_1.0 with value of -0.01. The feature with the second biggest influence on Model 3's negative (0) prediction probability is Hormonal Contraceptives_1.0 with value of -0.0086.
# Explain the current example with Model 4 via both local interpreters.
# For each one: print the textual summary, then render the visual artefact —
# LIME draws itself with show_in_notebook, the SHAP object goes to display().
for interpreter, render in (
    (LocalInterpreterType.LIME,
     lambda exp: exp.show_in_notebook(show_table=True, show_all=True)),
    (LocalInterpreterType.SHAP, display),
):
    explanation = explain_single_instance(interpreter, model_4, example)
    print(generate_single_instance_explanation(interpreter, model_4, example))
    render(explanation)
21-Oct-21 14:50:48 - Generating a single instance explanation using LIME for Model 4 ... 21-Oct-21 14:50:55 - Generating a single instance explanation using LIME for Model 4 ...
The prediction probability of Model 4's decision for this example is 0.93. LIME's explanation: The feature that primarily impacts Model 4's positive (1) prediction probability is Age > 31.75 with value of 0.0011. The feature with the second biggest change on Model 4's positive (1) prediction probability is IUD (years) > 0.00 with value of 0.0007. The third most influential feature for the positive (1) prediction probability of Model 4 is IUD=1.0 with value of 0.0001 The feature that primarily influences Model 4's negative (0) prediction probability is STDs: Time since last diagnosis <= 0.00 with value of -0.0007. The feature with the second most substantial influence on Model 4's negative (0) prediction probability is STDs: Time since first diagnosis <= 0.00 with value of -0.0006.
21-Oct-21 14:51:03 - Generating a single instance explanation using SHAP for Model 4 ...
The prediction probability of Model 4's decision for this example is 0.93. SHAP's explanation: The feature that mainly changes Model 4's positive (1) prediction probability is Hormonal Contraceptives (years) with value of 0.0002. The feature with the second biggest impact on Model 4's positive (1) prediction probability is Hormonal Contraceptives_1.0 with value of 0.0. The third most effective feature for the positive (1) prediction probability of Model 4 is Hormonal Contraceptives_0.0 with value of 0.0 The feature that mostly influences Model 4's negative (0) prediction probability is IUD (years) with value of -0.0013. The feature with the second most substantial affect on Model 4's negative (0) prediction probability is Age with value of -0.0011.